package com.kdtech.analyse.Bbs;
import com.alibaba.fastjson.JSONObject;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;
import com.kdtech.utils.StringUtils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;

/**
 * Parser for the JRJ (金融界) stock forum, http://bbs.jrj.com.cn/.
 *
 * <p>Recognises thread detail pages of the forum (and of the related
 * istock/itougu sub-sites), extracts title, author, date and content from the
 * page HTML, and fetches reply/view counters from the site's auxiliary
 * endpoints.
 *
 * @author 小聂
 */
public class JrjBbsAnalyse implements AnalyseNews{

	/**
	 * Full-match patterns of the URLs treated as detail pages. Compiled once;
	 * dots are escaped so that they only match a literal '.'.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
			Pattern.compile("http://bbs\\.jrj\\.com\\.cn/msg,[0-9]*\\.html"),
			Pattern.compile("http://bbs\\.jrj\\.com\\.cn/msg%2C[0-9]*\\.html"),
			Pattern.compile("http://istock\\.jrj\\.com\\.cn/article,[0-9]*,[0-9]*\\.html"),
			Pattern.compile("http://itougu\\.jrj\\.com\\.cn/view/[0-9]*\\.jspa[?]tgqdcode=4BUD6HJ8")
	};

	/** Locates the "msg,&lt;id&gt;" fragment of a forum thread URL. */
	private static final Pattern MSG_ID_PATTERN = Pattern.compile("msg(,[0-9]+)+");

	/** Literal prefix in front of the id inside a {@link #MSG_ID_PATTERN} match. */
	private static final String MSG_PREFIX = "msg,";

	/** URL prefix of the istock sub-site, which has its own click-count endpoint. */
	private static final String ISTOCK_PREFIX = "http://istock.jrj.com.cn/";

	/**
	 * Tells whether the URL is a thread detail page this parser understands.
	 *
	 * @param url absolute URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches one of the known detail-page
	 *         patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Parses a downloaded thread page into a {@link NewsMeta}.
	 *
	 * <p>Title, author, date and content are read from the page with a series of
	 * fallback selectors; the first element whose class starts with "lou" is
	 * used as a fallback source and for the reply count. Afterwards the
	 * reply/view counters are refreshed through HTTP requests to the site's
	 * statistics endpoints.
	 *
	 * @param urlMeta the crawled page (URL plus HTML)
	 * @return the populated meta object, or {@code null} when {@code urlMeta} or
	 *         its HTML is {@code null} (there is nothing to parse)
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		// Without HTML there is nothing to parse; previously a null document
		// fell through to Jsoup.parse(null).
		if (urlMeta == null || urlMeta.getHtml() == null) {
			return null;
		}
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(urlMeta.getHtml());

		NewsMeta bbs = new NewsMeta();
		bbs.setUrl(url);

		String title = extractTitle(doc);
		String author = doc.select("p.name b").text();
		String dateStr = doc.select("html body div#topIndex.Bbs_Thread div.author p.name span.fr").text();
		Long date = DateUtils.matchDate(dateStr);

		String content = HtmlCleaner.getContentHtml(url, doc.select("#msgMainContent"));
		if (StringUtils.isBlank(content)) {
			content = HtmlCleaner.getContentHtml(url, doc.select("div.main"));
		}

		// Alternative page layout: the first "lou*" element carries the opening post.
		Element firstPost = doc.select("div[class^=lou]").first();
		if (firstPost != null) {
			String header = firstPost.select("p.title").text();
			if (StringUtils.isBlank(author)) {
				author = firstPost.select("p.title a").text();
			}
			if (StringUtils.isBlank(content)) {
				content = HtmlCleaner.getContentHtml(url, firstPost.select("div.content"));
			}
			if (date == null) {
				date = DateUtils.matchDate(header);
			}
			// The reply count sits between the labels "回复数" (reply count) and "举报" (report).
			Integer commentNum = NumberUtils.matchNumber(StringUtils.substringBetween(header, "回复数", "举报"));
			// NOTE(review): matchNumber presumably yields null when no number is found;
			// skip the setter then so a primitive-typed setter cannot unbox null.
			if (commentNum != null) {
				bbs.setCommentNum(commentNum);
			}
		}
		if (date == null) {
			date = DateUtils.matchDate(doc.select("p.tc").text());
		}

		bbs.setContent(content);
		bbs.setDate(date);
		bbs.setTitle(title);
		// NOTE(review): type code 4 presumably marks forum/BBS items — confirm against NewsMeta.
		bbs.setType(4);
		bbs.setAuthor(author);

		fillStatistics(bbs, url);
		return bbs;
	}

	/**
	 * Re-fetches the reply and view counters of a thread from its update URL.
	 *
	 * <p>The response is expected to be a JSON object; its "replynum" and "pv"
	 * fields are written into {@code meta} as comment count and click count.
	 *
	 * @param meta the item to refresh; its update URL must be set
	 * @return the same {@code meta} instance with refreshed counters, or
	 *         {@code null} when {@code meta} or its update URL is {@code null},
	 *         the request yields no usable JSON body, or any exception occurs
	 */
	public NewsMeta Update(NewsMeta meta) {
		try {
			if (meta == null) {
				return null;
			}
			String updateUrl = meta.getUpdateUrl();
			if (updateUrl == null) {
				return null;
			}
			UrlMeta response = CrawlHTML.responseToURL(updateUrl);
			if (response == null) {
				return null;
			}
			String body = response.getHtml();
			// Validate the body before parsing; previously a missing body was only
			// caught indirectly through a NullPointerException on the parsed object.
			if (StringUtils.isBlank(body)) {
				return null;
			}
			JSONObject json = JSONObject.parseObject(body);
			if (json == null) {
				return null;
			}
			meta.setCommentNum(json.getIntValue("replynum"));
			meta.setClickNum(json.getIntValue("pv"));
			return meta;
		} catch (Exception e) {
			// Best effort: a failed counter refresh must not abort the caller,
			// so the error is reported and signalled with a null result.
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Returns the thread title, trying the selectors "div.tit h1", "h3.tit" and
	 * "h1" in that order and stopping at the first non-blank result.
	 */
	private static String extractTitle(Document doc) {
		String[] selectors = { "div.tit h1", "h3.tit", "h1" };
		String title = null;
		for (String selector : selectors) {
			title = doc.select(selector).text();
			if (!StringUtils.isBlank(title)) {
				break;
			}
		}
		return title;
	}

	/**
	 * Fills reply/view counters from the remote statistics endpoints that apply
	 * to the given URL. Does nothing when {@code url} is {@code null}.
	 */
	private void fillStatistics(NewsMeta bbs, String url) {
		if (url == null) {
			return;
		}
		fillBbsStatistics(bbs, url);
		if (url.startsWith(ISTOCK_PREFIX)) {
			fillIstockClickNum(bbs, url);
		}
	}

	/**
	 * For URLs containing "msg,&lt;id&gt;": records the update URL on {@code bbs} and
	 * refreshes its counters through {@link #Update(NewsMeta)}.
	 */
	private void fillBbsStatistics(NewsMeta bbs, String url) {
		// isDetailPage also accepts the percent-encoded comma ("msg%2C123.html");
		// decode it so those pages get their counters as well.
		String normalized = url.replace("%2C", ",").replace("%2c", ",");
		Matcher matcher = MSG_ID_PATTERN.matcher(normalized);
		if (!matcher.find()) {
			return;
		}
		// The match always starts with "msg,", so the remainder is the id part.
		String id = matcher.group().substring(MSG_PREFIX.length());
		String updateUrl = "http://bbs.jrj.com.cn/article/getMainMsgInfo.jspa?msgid="+id+"&cid=1&tmp=1354259416492";
		bbs.setUpdateUrl(updateUrl);
		NewsMeta update = Update(bbs);
		if (update != null) {
			bbs.setCommentNum(update.getCommentNum());
			bbs.setClickNum(update.getClickNum());
		}
	}

	/**
	 * For istock article URLs ("article,&lt;forumid&gt;,&lt;topicid&gt;.html"): queries the
	 * click-count endpoint and stores the number found between the first pair
	 * of single quotes of the response.
	 */
	private void fillIstockClickNum(NewsMeta bbs, String url) {
		String forumid = StringUtils.substringBetween(url, ",", ",");
		String topicid = StringUtils.substringBefore(StringUtils.substringAfterLast(url, ","), ".");
		String clickUrl = "http://istock.jrj.com.cn/topicgetclick.jspa?forumid="+forumid+"&topicid="+topicid;
		UrlMeta response = CrawlHTML.responseToURL(clickUrl);
		if (response == null) {
			return;
		}
		Integer clickNum = NumberUtils.matchNumber(StringUtils.substringBetween(response.getHtml(), "'", "'"));
		// NOTE(review): same null caveat as for the reply count in parserHtml.
		if (clickNum != null) {
			bbs.setClickNum(clickNum);
		}
	}

}
